{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- https://vk.com/u_samovaraa?w=wall-81871567_61842\n",
    "- https://vk.com/vkys_nos?w=wall-41960737_13333\n",
    "- https://vk.com/receptik_kulinar?w=wall-59496329_52708\n",
    "- https://vk.com/lisimnik_cake?w=wall-82240292_25648\n",
    "- https://vk.com/kingcook?w=wall-59442940_11047\n",
    "- https://vk.com/u_samovaraa?w=wall-81871567_61917\n",
    "- https://vk.com/quickrecipes?w=wall-61337543_5892\n",
    "- https://vk.com/namenuru?w=wall-36303114_56579\n",
    "- https://vk.com/vegan_cookbook?w=wall-43818640_25903\n",
    "- https://vk.com/multivarka_cook?w=wall-51300483_11948"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from glob import iglob as list_paths\n",
    "\n",
    "\n",
    "def load_text(path, encoding='utf-8'):\n",
    "    \"\"\"Return the full contents of the text file at ``path``.\n",
    "\n",
    "    The encoding is passed to ``open`` explicitly so that decoding does not\n",
    "    depend on the platform's locale default; the recipe texts are Cyrillic,\n",
    "    so a wrong default would garble them or raise ``UnicodeDecodeError``.\n",
    "\n",
    "    Args:\n",
    "        path: path of the file to read.\n",
    "        encoding: text encoding of the file. Defaults to UTF-8, which is\n",
    "            assumed to be the encoding of the corpus (TODO confirm).\n",
    "\n",
    "    Returns:\n",
    "        The decoded file contents as a single string.\n",
    "    \"\"\"\n",
    "    with open(path, encoding=encoding) as file:\n",
    "        return file.read()\n",
    "\n",
    "\n",
    "# Load every recipe file; sorting the loaded strings makes the order of\n",
    "# `texts` independent of the order in which the paths are yielded.\n",
    "texts = sorted(map(load_text, list_paths('texts/*.txt')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def make_translation(source, target):\n",
    "    \"\"\"Build a ``str.translate`` table mapping ``source[i]`` to ``target[i]``.\n",
    "\n",
    "    Both arguments must have the same length. Keys and values of the\n",
    "    returned dict are the code points of the paired characters.\n",
    "    \"\"\"\n",
    "    assert len(source) == len(target)\n",
    "    keys = map(ord, source)\n",
    "    values = map(ord, target)\n",
    "    return dict(zip(keys, values))\n",
    "\n",
    "\n",
    "# Translation table that folds the listed Unicode dash-like characters\n",
    "# into the ASCII hyphen-minus.\n",
    "DASHES_TRANSLATION = make_translation(source='‑–—−', target='----')\n",
    "\n",
    "\n",
    "def preprocess(text):\n",
    "    \"\"\"Normalize spacing and dash characters in ``text``.\n",
    "\n",
    "    Replaces U+00A0 with a plain space, removes U+00AD, then applies\n",
    "    ``DASHES_TRANSLATION``. Returns the normalized string.\n",
    "    \"\"\"\n",
    "    return (\n",
    "        text\n",
    "        .replace('\\xa0', ' ')\n",
    "        .replace('\\xad', '')\n",
    "        .translate(DASHES_TRANSLATION)\n",
    "    )\n",
    "\n",
    "\n",
    "# Apply the normalization to every loaded text.\n",
    "texts = list(map(preprocess, texts))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\"Баранки\"\n",
      "\n",
      "Баранки хорошо подходят к чаю, кофе и молоку. У изделий хрустящая корочка и рассыпчатый мякиш. Идеальны для детей как сладкое лакомство.\n",
      "Совет шеф-повара: Полейте баранки жидким шоколадом или посыпьте сахарной пудрой. Добавьте в тесто обжаренные кунжутные семечки, это придаст выпечке легкий восточный колорит.\n",
      "\n",
      "Количество порций - 16 шт. (по 60 г)\n",
      "Время приготовления - 45 мин\n",
      "\n",
      "Ингредиенты:\n",
      "\n",
      "450 г пшеничной муки\n",
      "2 яйца\n",
      "50 мл растительного масла\n",
      "200 мл молока + 20 мл на смазывание\n",
      "10 ч. л. сахара\n",
      "1 ч. л. соды\n",
      "1 ч. л. соли\n",
      "15 г муки на подпыл\n",
      "\n",
      "Еще больше рецептов в приложении для iPad и iPhone \"Готовят все\" https://itunes.apple.com/ru/app/id587314984\n"
     ]
    }
   ],
   "source": [
    "# Show one sample text; print() renders its line breaks, unlike the bare repr.\n",
    "print(texts[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "\n",
    "def find_ingredient_sections(text):\n",
    "    \"\"\"Return the spans of ``text`` between 'ингредиенты:' and 'приготовление'.\n",
    "\n",
    "    Matching is case-insensitive and a span may cover several lines; each\n",
    "    span stops at the first 'приготовление' that follows its header.\n",
    "    The result is a list of strings, empty when nothing matches.\n",
    "    \"\"\"\n",
    "    pattern = re.compile(\n",
    "        r'ингредиенты:(.+?)приготовление',\n",
    "        flags=re.IGNORECASE | re.DOTALL\n",
    "    )\n",
    "    return [found.group(1) for found in pattern.finditer(text)]\n",
    "\n",
    "\n",
    "\n",
    "def maybe_ingredient(line, max_length=50):\n",
    "    \"\"\"Cheap heuristic: does ``line`` look like an ingredient entry?\n",
    "\n",
    "    A line qualifies when it contains at least one digit (presumably a\n",
    "    quantity) and is at most ``max_length`` characters long.\n",
    "\n",
    "    Args:\n",
    "        line: a single line of text.\n",
    "        max_length: longest line, in characters, still accepted.\n",
    "\n",
    "    Returns:\n",
    "        ``True`` or ``False``. The result is always a real bool, never\n",
    "        ``None``, so it can be stored or compared safely.\n",
    "    \"\"\"\n",
    "    # Check the length first: it is cheaper than the regex scan.\n",
    "    if len(line) > max_length:\n",
    "        return False\n",
    "    return re.search(r'\\d', line) is not None\n",
    "\n",
    "\n",
    "# Collect every candidate line from the ingredient sections of all texts.\n",
    "lines = [\n",
    "    line\n",
    "    for text in texts\n",
    "    for section in find_ingredient_sections(text)\n",
    "    for line in section.splitlines()\n",
    "    if maybe_ingredient(line)\n",
    "]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Майонез - 2 ст. л. ',\n",
       " 'Яйцо - 1 шт.',\n",
       " '1 ст. л. уксуса',\n",
       " '● сахар - 1/2 стакана;',\n",
       " ' Сахар - 3 ст.ложки,',\n",
       " '100 г мякоти тыквы ',\n",
       " 'Огурец - 1 шт. ',\n",
       " '85 гр зелени',\n",
       " '- 1 стакан пшеничной муки',\n",
       " ' Черная смородина (у меня замороженная) - 1 ст.',\n",
       " 'морковь - 1 кг ',\n",
       " '1/2 ч.ложки сахара',\n",
       " '● 1 ч.л. сухих дрожжей (или 12 г свежих)',\n",
       " '3-4 зубчика чеснока ',\n",
       " '- Огурец свежий 2 шт.',\n",
       " '120 мл оливкового масла плюс 2 ст. ложки',\n",
       " 'Яйцо - 3 шт ',\n",
       " '● сметана - 200 г;',\n",
       " '- мука 2 стакана ',\n",
       " '* 3/4 стакана холодной кипяченой воды']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from random import seed, sample\n",
    "\n",
    "# Seed the global RNG with a fixed value so the draw below is reproducible.\n",
    "seed(1)\n",
    "# Display 20 randomly chosen candidate lines for a quick visual check.\n",
    "sample(lines, 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4054"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Total number of candidate ingredient lines gathered from all texts.\n",
    "len(lines)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Grammar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[MorphToken(\n",
       "     value='рулетики',\n",
       "     span=[0, 8),\n",
       "     type='RU',\n",
       "     forms=[Form('рулетик', Grams(NOUN,inan,masc,nomn,plur)),\n",
       "      Form('рулетик', Grams(NOUN,accs,inan,masc,plur)),\n",
       "      Form('рулетик', Grams(NOUN,inan,masc,nomn,plur)),\n",
       "      Form('рулетик', Grams(NOUN,accs,inan,masc,plur)),\n",
       "      Form('рулётик', Grams(NOUN,inan,masc,nomn,plur)),\n",
       "      Form('рулётик', Grams(NOUN,accs,inan,masc,plur))]\n",
       " )]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from yargy.tokenizer import MorphTokenizer\n",
    "\n",
    "# Module-level tokenizer instance.\n",
    "TOKENIZER = MorphTokenizer()\n",
    "# Tokenize one sample word and display the resulting list of tokens.\n",
    "list(TOKENIZER('рулетики'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %run -n main.py\n",
    "# parser = Parser(INGREDIENT)\n",
    "# seed(1)\n",
    "# for line in sample(lines, 100):\n",
    "#     matches = list(parser.findall(line))\n",
    "#     spans = [_.span for _ in matches]\n",
    "#     show_markup(line, spans)\n",
    "# #     if matches:\n",
    "# #         match = matches[0]\n",
    "# #         display(match.tree.as_dot)\n",
    "# #         display(match.fact)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
